1 Prepare Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/Jo/OneDrive/1_Hertie Studies/Thesis/Hertie-Thesis-Mehler
library(stats)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
## corrplot 0.92 loaded
library(descr)

# load the combined dataset; the path is resolved by here() from the project root
data <- here("data/data_combined.csv") %>% read_csv()
## Rows: 1019 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): ResponseId, academic_status, educ_cat, gender, age_cat, polinteres...
## dbl (12): age, age10, polinterest, empathy_pc, exp_hate_speech, exp_hostile_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# indicators: keep complete rows only, then apply the cut-offs on text length
# and readability score that remove the extreme outliers
data_indicators <- data %>%
  select(cluster, text_length, readability_score, leftright_pred_score) %>%
  drop_na() %>%
  filter(text_length < 2500 & readability_score < 40)

# controls, categorical measurement (complete rows only)
controls_cat <- drop_na(select(
  data,
  academic_status, gender, age_cat, minority_cat, polinterest_cat_3,
  empathy_pc_cat, exp_hate_speech_cat, exp_hostile_engagement_cat
))

# controls, numerical measurement (complete rows only)
controls_num <- drop_na(select(
  data,
  academic_status, age10, minority, polinterest, empathy_pc,
  exp_hate_speech, exp_hostile_engagement
))


# # if needed, look at the original huge sample controls instead of only my combined dataset
# controls <- read_csv(here("data/controls.csv"))
# # keep only the numeric controls in order to check correlations
# data_controls <- controls %>% select(-ResponseId, -gender, -age, -age_cat, -polinterest_cat_3, -empathy_pc_cat, -exp_hate_speech_cat, -exp_hostile_engagement_cat) %>% drop_na()

2 Correlations between Indicators

2.1 Scatter Plots and Correlation

# indicator columns in plotting order (continuous variables first)
indicators <- c(
  "text_length",
  "readability_score",
  "leftright_pred_score",
  "cluster"
)

ggpairs(data = data_indicators, columns = indicators)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 Correlation Matrix (without significance)

# keep only the numeric indicators
data_indicators_num <- select(
  data_indicators,
  text_length, readability_score, leftright_pred_score
)

# Pearson correlations between the indicators; the parentheses print the result
(cor_matrix <- cor(data_indicators_num, method = "pearson"))
##                      text_length readability_score leftright_pred_score
## text_length            1.0000000        0.42740196           0.13190421
## readability_score      0.4274020        1.00000000           0.08203617
## leftright_pred_score   0.1319042        0.08203617           1.00000000

2.3 Correlation Plot

# lower-triangle colour map of the indicator correlations with coefficients
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 1,
  tl.col = "black",
  addCoef.col = "black"
)

3 Correlations between Controls

3.1 Correlation Matrix (without significance)

# numeric controls without academic_status
controls_num_selection <- select(controls_num, -academic_status)

# Spearman rank correlations between the controls; the parentheses print the result
(cor_matrix <- cor(controls_num_selection, method = "spearman"))
##                              age10     minority polinterest   empathy_pc
## age10                   1.00000000 -0.107416278  0.19565209  0.110195843
## minority               -0.10741628  1.000000000 -0.03657133  0.009031454
## polinterest             0.19565209 -0.036571335  1.00000000 -0.080177227
## empathy_pc              0.11019584  0.009031454 -0.08017723  1.000000000
## exp_hate_speech        -0.14542495  0.119937419  0.09873072 -0.123841485
## exp_hostile_engagement  0.09021549  0.117348119  0.28848369  0.063991034
##                        exp_hate_speech exp_hostile_engagement
## age10                      -0.14542495             0.09021549
## minority                    0.11993742             0.11734812
## polinterest                 0.09873072             0.28848369
## empathy_pc                 -0.12384148             0.06399103
## exp_hate_speech             1.00000000             0.26899096
## exp_hostile_engagement      0.26899096             1.00000000

3.2 Correlation Plot

# lower-triangle colour map of the control correlations with coefficients
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 1,
  tl.col = "black",
  addCoef.col = "black"
)

3.3 Try Outs: Relationship between Educ_cat and my potential controls

# Relationship between education (educ_cat) and experience with online hostile engagement.
# prop.c = TRUE adds column percentages, i.e. shares within each education category.
crosstab(data$exp_hostile_engagement_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## =======================================================================
##                                    data$educ_cat
## data$exp_hostile_engagement_cat     High   Intermediate     Low   Total
## -----------------------------------------------------------------------
## Less experience                     241             77      69     387 
##                                    41.9%          32.9%   34.3%        
## -----------------------------------------------------------------------
## More experience                     334            157     132     623 
##                                    58.1%          67.1%   65.7%        
## -----------------------------------------------------------------------
## Total                               575            234     201    1010 
##                                    56.9%          23.2%   19.9%        
## =======================================================================
# Experience with hate speech (categorical) by education, with column percentages
crosstab(data$exp_hate_speech_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ================================================================
##                             data$educ_cat
## data$exp_hate_speech_cat     High   Intermediate     Low   Total
## ----------------------------------------------------------------
## Less experience              307            125     113     545 
##                             53.3%          53.4%   55.9%        
## ----------------------------------------------------------------
## More experience              269            109      89     467 
##                             46.7%          46.6%   44.1%        
## ----------------------------------------------------------------
## Total                        576            234     202    1012 
##                             56.9%          23.1%   20.0%        
## ================================================================
# Political interest (three-category version) by education, with column percentages
crosstab(data$polinterest_cat_3, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ==============================================================
##                           data$educ_cat
## data$polinterest_cat_3     High   Intermediate     Low   Total
## --------------------------------------------------------------
## High                       256             93      88     437 
##                           44.4%          39.7%   43.6%        
## --------------------------------------------------------------
## Intermediate               227            110      76     413 
##                           39.4%          47.0%   37.6%        
## --------------------------------------------------------------
## Low                         93             31      38     162 
##                           16.1%          13.2%   18.8%        
## --------------------------------------------------------------
## Total                      576            234     202    1012 
##                           56.9%          23.1%   20.0%        
## ==============================================================
# Political interest (uncategorised numeric values) by education, with column percentages
crosstab(data$polinterest, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ========================================================
##                     data$educ_cat
## data$polinterest     High   Intermediate     Low   Total
## --------------------------------------------------------
## 1                     32              8       9      49 
##                      5.6%           3.4%    4.5%        
## --------------------------------------------------------
## 2                     61             23      29     113 
##                     10.6%           9.8%   14.4%        
## --------------------------------------------------------
## 3                    227            110      76     413 
##                     39.4%          47.0%   37.6%        
## --------------------------------------------------------
## 4                    256             93      88     437 
##                     44.4%          39.7%   43.6%        
## --------------------------------------------------------
## Total                576            234     202    1012 
##                     56.9%          23.1%   20.0%        
## ========================================================
# Empathy (categorical) by education, with column percentages
crosstab(data$empathy_pc_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ===========================================================
##                        data$educ_cat
## data$empathy_pc_cat     High   Intermediate     Low   Total
## -----------------------------------------------------------
## Less empathetic         322            121      91     534 
##                        56.3%          51.7%   45.3%        
## -----------------------------------------------------------
## More empathetic         250            113     110     473 
##                        43.7%          48.3%   54.7%        
## -----------------------------------------------------------
## Total                   572            234     201    1007 
##                        56.8%          23.2%   20.0%        
## ===========================================================
# leftright values by education, with column percentages
crosstab(data$leftright, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ======================================================
##                   data$educ_cat
## data$leftright     High   Intermediate     Low   Total
## ------------------------------------------------------
## 1                   25             10      12      47 
##                    4.3%           4.3%    5.9%        
## ------------------------------------------------------
## 2                   19              9       9      37 
##                    3.3%           3.8%    4.5%        
## ------------------------------------------------------
## 3                   61             11      17      89 
##                   10.6%           4.7%    8.4%        
## ------------------------------------------------------
## 4                   44             24      12      80 
##                    7.6%          10.3%    5.9%        
## ------------------------------------------------------
## 5                   62             16      22     100 
##                   10.8%           6.8%   10.9%        
## ------------------------------------------------------
## 6                  131             45      47     223 
##                   22.7%          19.2%   23.3%        
## ------------------------------------------------------
## 7                   66             33      29     128 
##                   11.5%          14.1%   14.4%        
## ------------------------------------------------------
## 8                   68             30      23     121 
##                   11.8%          12.8%   11.4%        
## ------------------------------------------------------
## 9                   47             27      11      85 
##                    8.2%          11.5%    5.4%        
## ------------------------------------------------------
## 10                  15             15       9      39 
##                    2.6%           6.4%    4.5%        
## ------------------------------------------------------
## 11                  38             14      11      63 
##                    6.6%           6.0%    5.4%        
## ------------------------------------------------------
## Total              576            234     202    1012 
##                   56.9%          23.1%   20.0%        
## ======================================================

3.4 Relationship between control variables

# Plot the relationship between every unique pair of columns in a data frame.
#
# For each pair (an earlier column on the x-axis, a later column on the y-axis)
# a jittered scatter plot with a linear-model smoother is built and printed.
#
# @param df A data frame; every column is paired with each column after it.
# @return The list of ggplot objects, one per pair, returned invisibly so that
#   a top-level call does not auto-print (and thereby re-draw) every plot.
plot_relationships <- function(df) {
  col_names <- names(df)
  n_cols <- length(col_names)

  # With fewer than two columns there is no pair to plot
  if (n_cols < 2) {
    return(invisible(list()))
  }

  # All unique index pairs (i < j), in the same order as a nested i/j loop
  index_pairs <- utils::combn(seq_len(n_cols), 2, simplify = FALSE)

  # Each plot is built inside its own function call: aes() captures x and y
  # lazily, so every plot needs its own copy of the two column names
  plots <- lapply(index_pairs, function(idx) {
    x <- col_names[idx[1]]
    y <- col_names[idx[2]]

    # .data[[name]] replaces the deprecated aes_string()
    ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
      geom_point(
        position = position_jitter(width = 0.2, height = 0.2),
        alpha = 0.6,
        color = "skyblue"
      ) +
      geom_smooth(method = "lm", colour = "black", linewidth = 0.5) +
      theme_minimal() +
      labs(
        title = paste("Scatter plot between", x, "and", y),
        x = x,
        y = y
      )
  })

  # Draw each plot exactly once
  for (p in plots) {
    print(p)
  }

  invisible(plots)
}

3.4.1 Create scatter plots for categorical measurement

# use only categorical variables; any_of() keeps this chunk re-runnable after
# academic_status has already been removed from controls_cat
controls_cat <- controls_cat %>% select(-any_of("academic_status"))

plot_relationships(controls_cat)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[9]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[10]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[11]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[12]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[13]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[14]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[15]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[16]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[17]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[18]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[19]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[20]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[21]]
## `geom_smooth()` using formula = 'y ~ x'

3.4.2 Create scatter plots for numerical measurement

# use only numerical variables; any_of() keeps this chunk re-runnable after
# academic_status has already been removed from controls_num
controls_num <- controls_num %>% select(-any_of("academic_status"))

plot_relationships(controls_num)
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[9]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[10]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[11]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[12]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[13]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[14]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[15]]
## `geom_smooth()` using formula = 'y ~ x'